引言

这里直接读取作者给定的第一个病人的Gene expression analysis: discovery patient PBMC,用的是 10x genomics 3’ Chromium expression assay

Following sequence alignment and filtering, a total of 12,874 cells were analyzed.

最后是 17,712 genes and 12,874 cells

载入必要的R包

需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat

因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。

参考:http://www.bio-info-trainee.com/3727.html

if (!requireNamespace("BiocManager"))
    install.packages("BiocManager")
if (!requireNamespace("Seurat"))
    BiocManager::install("Seurat")

加载R包

rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))

读入文章关于第一个病人的PBMC表达矩阵

start_time <- Sys.time()
raw_dataPBMC <- read.csv('../Output_2018-03-12/GSE117988_raw.expMatrix_PBMC.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time # for 2.7 GHz i5, 8G RAM, total = 4min
## Time difference of 1.482228 mins
dim(raw_dataPBMC) 
## [1] 17712 12874
start_time <- Sys.time()
dataPBMC <- log2(1 + sweep(raw_dataPBMC, 2, median(colSums(raw_dataPBMC))/colSums(raw_dataPBMC), '*')) # Normalization
end_time <- Sys.time()
end_time - start_time
## Time difference of 42.03681 secs
# 3.0版本取消了ExtractField
# 版本2 timePoints <- sapply(colnames(dataPBMC), function(x) ExtractField(x, 2, '[.]'))
timePoints <- sapply(colnames(dataPBMC), function(x) unlist(strsplit(x, "\\."))[2]) 

timePoints <-ifelse(timePoints == '1', 'PBMC_Pre', 
                    ifelse(timePoints == '2', 'PBMC_EarlyD27',
                           ifelse(timePoints == '3', 'PBMC_RespD376', 'PBMC_ARD614')))
table(timePoints)
## timePoints
##   PBMC_ARD614 PBMC_EarlyD27      PBMC_Pre PBMC_RespD376 
##          4516          1592          2082          4684

表达矩阵的质量控制

简单看看表达矩阵的性质,主要是基因数量,细胞数量;以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。

fivenum(apply(dataPBMC,1,function(x) sum(x>0) ))
## RP4-669L17.10         LAMB3         NAT10    AC093673.5         RPL21 
##             1             6            50           207         12102
boxplot(apply(dataPBMC,1,function(x) sum(x>0) ))

fivenum(apply(dataPBMC,2,function(x) sum(x>0) ))
## CAAGAAATCGATCCCT.2 GGCCGATTCCAGAGGA.3 TCAACGAAGAGCTGGT.3 
##                 10                321                395 
## TGCCAAAGTCTGCGGT.4 TCTGGAATCTATCGCC.3 
##                481               2865
hist(apply(dataPBMC,2,function(x) sum(x>0) ))

然后创建Seurat的对象

start_time <- Sys.time()
# 3.0版本Create Seurat object稍作改进
PBMC <- CreateSeuratObject(dataPBMC, 
                           min.cells = 1, min.features = 0, 
                           project = '10x_PBMC') # already normalized
PBMC # 17,712 genes and 12,874 cells
## An object of class Seurat 
## 17712 features across 12874 samples within 1 assay 
## Active assay: RNA (17712 features)
end_time <- Sys.time()
end_time - start_time 
## Time difference of 2.670036 secs
# Add meta.data (nUMI and timePoints)
# 3.0版本可以直接使用 object$name <- vector,当然也可以用AddMetaData
PBMC <- AddMetaData(object = PBMC, 
                    metadata = apply(raw_dataPBMC, 2, sum),
                    col.name = 'nUMI_raw')
PBMC <- AddMetaData(object = PBMC, metadata = timePoints, col.name = 'TimePoints')

一些质控

这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 TimePoints 属性,所以可以用来进行可视化。

这里是:‘TimePoints’

sce=PBMC
features=c("nFeature_RNA", "nUMI_raw")
VlnPlot(object = sce, 
        features = features, 
        group.by = 'TimePoints', ncol = 2)

# 3.0版本将GenePlot替换为FeatureScatter
# 版本2 GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")
FeatureScatter(sce,feature1 = "nUMI_raw",feature2 = "nFeature_RNA")

可以看看高表达量基因是哪些

# 3.0版本要将sce@raw.data替换成GetAssayData(object = , assay= ,slot = )
tail(sort(Matrix::rowSums(GetAssayData(sce,assay = "RNA"))))
##   EEF1A1   RPL13A    RPLP1   TMSB4X      B2M   MALAT1 
## 37203.12 38345.11 38977.99 43027.61 46854.95 62363.12
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(GetAssayData(sce,assay = "RNA")),decreasing = T))
# 版本2 GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])
FeatureScatter(object = sce, feature1 = tmp[1], feature2 = tmp[2])

# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞

# 3.0版本将CellPlot替换成CellScatter,sce@cell.names换为colnames
# 版本2 CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)
CellScatter(sce, colnames(sce)[3],colnames(sce)[4])

最后标准聚类可视化

This process consists of data normalization and variable feature selection, data scaling, a PCA on variable features, construction of a shared-nearest-neighbors graph, and clustering using a modularity optimizer. Finally, we use a t-SNE to visualize our clusters in a two-dimensional space.

start_time <- Sys.time()
# Cluster PBMC
PBMC <- ScaleData(object = PBMC, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
# 3.0版本将FindVariableGenes换为FindVariableFeatures,另外将原来的cutoff进行整合,x轴统一归到mean.cutoff中,y轴归到dispersion.cutoff中
# 版本2 PBMC <- FindVariableGenes(object = PBMC, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

PBMC <- FindVariableFeatures(object = PBMC, mean.function = ExpMean, dispersion.function = LogVMR, mean.cutoff = c(0.0125,3), dispersion.cutoff = c(0.5,Inf))

PBMC <- RunPCA(object = PBMC, pc.genes = VariableFeatures(PBMC))
## PC_ 1 
## Positive:  FTL, FTH1, S100A9, GPX1, CST3, LYZ, S100A8, SAT1, AIF1, PPBP 
##     LST1, FCN1, CTSS, CSTA, PF4, S100A12, GNG11, TYROBP, PSAP, NRGN 
##     SDPR, VCAN, COTL1, LGALS2, S100A6, RP11-1143G9.4, MAP3K7CL, NEAT1, HIST1H2AC, RGS18 
## Negative:  NKG7, IL32, GNLY, GZMA, HLA-A, UBB, CST7, MALAT1, GZMH, GZMB 
##     CTSW, FGFBP2, KLRB1, CD7, BTG1, HOPX, CMC1, CD247, LTB, KLRD1 
##     PRF1, CD3D, CXCR4, GZMM, TRBC2, TRBC1, TRAC, TRDC, CCL5, HBA1 
## PC_ 2 
## Positive:  S100A9, LYZ, S100A8, AIF1, FCN1, CTSS, LST1, S100A6, TYROBP, FTL 
##     CSTA, NEAT1, S100A11, S100A12, VCAN, LGALS2, RP11-1143G9.4, CST3, TYMP, PSAP 
##     LGALS1, SERPINA1, DUSP1, S100A4, FOS, MNDA, VIM, MAFB, CD14, MS4A6A 
## Negative:  PF4, GNG11, SDPR, HIST1H2AC, TUBB1, PPBP, ACRBP, RGS18, CLU, NRGN 
##     GP9, SPARC, MAP3K7CL, NGFRAP1, NCOA4, TSC22D1, TREML1, MMD, PTCRA, RUFY1 
##     PGRMC1, CLEC1B, C6orf25, HIST1H3H, CMTM5, ITGA2B, TMEM40, MYL9, TUBA4A, CCL5 
## PC_ 3 
## Positive:  HBB, HBA1, HBA2, ALAS2, CD79A, AHSP, HBD, SNCA, IGKC, IGHD 
##     SLC25A37, IGHM, HLA-DRA, TCL1A, CA1, CD79B, HBM, CD74, MS4A1, IGLC2 
##     HLA-DQA1, SLC25A39, VPREB3, LTB, BLVRB, HLA-DRB1, BANK1, BPGM, LINC00926, FCER2 
## Negative:  NKG7, B2M, CCL5, CST7, GNLY, GZMB, HLA-A, GZMA, FGFBP2, S100A4 
##     MALAT1, HCST, GZMH, CTSW, ACTB, IFITM2, PRF1, KLRB1, KLRD1, ACTG1 
##     CD247, GAPDH, CMC1, CD7, HOPX, KLRF1, SPON2, SRGN, TRDC, FCGR3A 
## PC_ 4 
## Positive:  HBA1, HBA2, HBB, ALAS2, AHSP, HBD, SNCA, SLC25A37, CA1, HBM 
##     BLVRB, UBB, SLC25A39, BPGM, MPP1, GMPR, HBQ1, S100A8, PDZK1IP1, S100A9 
##     S100A12, NCOA4, LYZ, VCAN, IFI27, RP11-1143G9.4, TYROBP, MYL4, FOS, CSTA 
## Negative:  CD74, CD79A, MALAT1, HLA-DRA, HLA-DPB1, CD37, IGHD, HLA-DRB1, BTG1, IGKC 
##     CD79B, HLA-DPA1, HLA-DQA1, IGHM, TCL1A, CXCR4, B2M, CD52, MS4A1, IGLC2 
##     LTB, HLA-DQB1, VPREB3, BANK1, PLPP5, FCER2, LINC00926, IGLC3, CD69, HLA-DOB 
## PC_ 5 
## Positive:  TRAC, IL7R, CD3D, IL32, LTB, GZMK, CD52, TRBC2, VIM, CD8A 
##     DUSP2, TRBC1, JUNB, MALAT1, B2M, S100A4, GAPDH, KLRG1, S100A12, TRGC2 
##     HLA-A, S100A6, S100A8, VCAN, CXCR4, RP11-1143G9.4, BTG1, FOS, S100A9, MIAT 
## Negative:  FCGR3A, HLA-DRA, GZMB, HLA-DPA1, HLA-DPB1, CD74, UBB, FGFBP2, HLA-DRB1, SPON2 
##     TYROBP, FCER1G, PRF1, KLRF1, HLA-DQA1, NKG7, CD79A, CD79B, RHOC, HBA1 
##     CST7, HBA2, ALAS2, HBB, HBD, IGHD, AHSP, IFITM3, CLIC3, IGFBP7
## 避免太多log日志被打印出来。

# 3.0版本将FindClusters拆分为FindNeighbors和FindClusters
# 版本2是一个函数
# PBMC <- FindClusters(object = PBMC, 
#                      reduction.type = "pca", 
#                      dims.use = 1:10, 
#                      resolution = 1, 
#                      print.output = 0,
#                      k.param = 35, save.SNN = TRUE) # 13 clusters

PBMC <- FindNeighbors(PBMC, reduction = "pca", dims = 1:10,
                      k.param = 35)
PBMC <- FindClusters(object = PBMC, 
                     resolution = 1, verbose=F) 

PBMC <- RunTSNE(object = PBMC, dims.use = 1:10)

DimPlot(PBMC, colors = c('green4', 'pink', '#FF7F00', 'orchid', '#99c9fb', 'dodgerblue2', 'grey30', 'yellow', 'grey60', 'grey', 'red', '#FB9A99', 'black'))

end_time <- Sys.time()
end_time - start_time # for 2.7 GHz i5, 8G RAM, total = 1.2h
## Time difference of 11.88596 mins
save(PBMC,file = 'patient1.PBMC.output.Rdata')
# 这个步骤输出文件 1.75G, 遂放弃!

最后,这 13 clusters要进行注释,才能发表,如下所示:

作者文章里面是Representative marker genes shown in Supplementary Fig. 7. 如下所示

可以看到作者对PBMC里面的细胞都挑选了一个基因就命名了。

显示运行环境

sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Seurat_3.0.2
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137        tsne_0.1-3          bitops_1.0-6       
##  [4] RColorBrewer_1.1-2  httr_1.4.0          sctransform_0.2.0  
##  [7] tools_3.5.3         R6_2.4.0            irlba_2.3.3        
## [10] KernSmooth_2.23-15  lazyeval_0.2.2      colorspace_1.4-1   
## [13] npsurv_0.4-0        withr_2.1.2         tidyselect_0.2.5   
## [16] gridExtra_2.3       compiler_3.5.3      plotly_4.9.0       
## [19] labeling_0.3        caTools_1.17.1.2    scales_1.0.0       
## [22] lmtest_0.9-37       ggridges_0.5.1      pbapply_1.4-0      
## [25] stringr_1.4.0       digest_0.6.19       rmarkdown_1.12     
## [28] R.utils_2.8.0       pkgconfig_2.0.2     htmltools_0.3.6    
## [31] bibtex_0.4.2        htmlwidgets_1.3     rlang_0.3.4        
## [34] zoo_1.8-5           jsonlite_1.6        ica_1.0-2          
## [37] gtools_3.8.1        dplyr_0.8.1         R.oo_1.22.0        
## [40] magrittr_1.5        Matrix_1.2-15       Rcpp_1.0.1         
## [43] munsell_0.5.0       ape_5.3             reticulate_1.12    
## [46] R.methodsS3_1.7.1   stringi_1.4.3       yaml_2.2.0         
## [49] gbRd_0.4-11         MASS_7.3-51.1       gplots_3.0.1.1     
## [52] Rtsne_0.15          plyr_1.8.4          grid_3.5.3         
## [55] parallel_3.5.3      gdata_2.18.0        listenv_0.7.0      
## [58] ggrepel_0.8.0       crayon_1.3.4        lattice_0.20-38    
## [61] cowplot_0.9.4       splines_3.5.3       SDMTools_1.1-221.1 
## [64] knitr_1.22          pillar_1.3.1        igraph_1.2.4       
## [67] future.apply_1.3.0  reshape2_1.4.3      codetools_0.2-16   
## [70] glue_1.3.1          evaluate_0.13       lsei_1.2-0         
## [73] metap_1.1           data.table_1.12.0   BiocManager_1.30.4 
## [76] png_0.1-7           Rdpack_0.11-0       gtable_0.3.0       
## [79] RANN_2.6.1          purrr_0.3.2         tidyr_0.8.3        
## [82] future_1.14.0       assertthat_0.2.1    ggplot2_3.1.0      
## [85] xfun_0.8            rsvd_1.0.1          survival_2.43-3    
## [88] viridisLite_0.3.0   tibble_2.1.1        cluster_2.0.7-1    
## [91] globals_0.12.4      fitdistrplus_1.0-14 ROCR_1.0-7